import pandas as pd
import altair as alt
import numpy as np
# load the filtered dataset from its CSV file into a DataFrame
csv_path = 'ds4200_filtered_data2.csv'
data = pd.read_csv(csv_path)
# preview the first five rows
data.head(n=5)
| | Unnamed: 0.1 | Unnamed: 0 | case_enquiry_id | year | case_title | neighborhood |
|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 101003148628 | 2020 | Tree Maintenance Requests | Hyde Park |
| 1 | 1 | 1 | 101003148656 | 2020 | Tree Maintenance Requests | Charlestown |
| 2 | 2 | 2 | 101003149796 | 2020 | Tree Maintenance Requests | Allston / Brighton |
| 3 | 3 | 3 | 101003149821 | 2020 | Tree Maintenance Requests | Brighton |
| 4 | 4 | 4 | 101003149973 | 2020 | New Tree Requests | Charlestown |
# check that all the columns are correct:
# info() prints the index range plus each column's name, non-null count and dtype
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 91236 entries, 0 to 91235
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype
---  ------           --------------  -----
 0   Unnamed: 0.1     91236 non-null  int64
 1   Unnamed: 0       91236 non-null  int64
 2   case_enquiry_id  91236 non-null  int64
 3   year             91236 non-null  int64
 4   case_title       91236 non-null  object
 5   neighborhood     91236 non-null  object
dtypes: int64(4), object(2)
memory usage: 4.2+ MB
# tally every distinct case_title; dropna=False keeps missing values as
# their own row, so any nulls would be visible in the counts
case_titles = data['case_title']
case_titles.value_counts(dropna=False)
Tree Maintenance Requests    56188
New Tree Requests            19212
Tree Emergencies             15836
Name: case_title, dtype: int64
# tally every distinct neighborhood; dropna=False keeps missing values as
# their own row, so any nulls would be visible in the counts
neighborhoods = data['neighborhood']
neighborhoods.value_counts(dropna=False)
Dorchester                                      12863
West Roxbury                                     9751
South End                                        6573
Roxbury                                          6334
South Boston / South Boston Waterfront           5994
Jamaica Plain                                    5973
East Boston                                      5673
Roslindale                                       5293
Allston / Brighton                               5276
Hyde Park                                        4756
Greater Mattapan                                 4393
Charlestown                                      4238
Back Bay                                         3264
Fenway / Kenmore / Audubon Circle / Longwood     2182
Boston                                           2097
Beacon Hill                                      2080
Downtown / Financial District                    1906
Mission Hill                                     1365
South Boston                                      553
Brighton                                          408
Allston                                           145
Mattapan                                          117
Chestnut Hill                                       2
Name: neighborhood, dtype: int64
# replace blank neighborhood values with null; the anchored regex matches the
# empty string and any whitespace-only string, not just a single space
data['neighborhood'] = data['neighborhood'].replace(r'^\s*$', np.nan, regex=True)
# remove any rows with null values (a null in any column drops the row)
data = data.dropna()
# check that null values have been removed: with dropna=False a remaining
# NaN would show up as its own row in the counts
data['neighborhood'].value_counts(dropna=False)
Dorchester                                      12863
West Roxbury                                     9751
South End                                        6573
Roxbury                                          6334
South Boston / South Boston Waterfront           5994
Jamaica Plain                                    5973
East Boston                                      5673
Roslindale                                       5293
Allston / Brighton                               5276
Hyde Park                                        4756
Greater Mattapan                                 4393
Charlestown                                      4238
Back Bay                                         3264
Fenway / Kenmore / Audubon Circle / Longwood     2182
Boston                                           2097
Beacon Hill                                      2080
Downtown / Financial District                    1906
Mission Hill                                     1365
South Boston                                      553
Brighton                                          408
Allston                                           145
Mattapan                                          117
Chestnut Hill                                       2
Name: neighborhood, dtype: int64
# collapse the combined neighborhood labels into a single name each:
#   'South Boston / South Boston Waterfront' -> 'South Boston'
#   'Allston / Brighton'                     -> 'Brighton'
merged_names = {
    'South Boston / South Boston Waterfront': 'South Boston',
    'Allston / Brighton': 'Brighton',
}
data['neighborhood'] = data['neighborhood'].replace(merged_names)
# confirm the relabelling by recounting each neighborhood (nulls included)
data['neighborhood'].value_counts(dropna=False)
Dorchester                                      12863
West Roxbury                                     9751
South End                                        6573
South Boston                                     6547
Roxbury                                          6334
Jamaica Plain                                    5973
Brighton                                         5684
East Boston                                      5673
Roslindale                                       5293
Hyde Park                                        4756
Greater Mattapan                                 4393
Charlestown                                      4238
Back Bay                                         3264
Fenway / Kenmore / Audubon Circle / Longwood     2182
Boston                                           2097
Beacon Hill                                      2080
Downtown / Financial District                    1906
Mission Hill                                     1365
Allston                                           145
Mattapan                                          117
Chestnut Hill                                       2
Name: neighborhood, dtype: int64